Leo Lu
2018-03-05
呂奕 Leo Lu
台大工管
目前於金融業服務
在 x-y 二維的 Scatterplot 加入第三個 aesthetic
?geom_point: 查詢支援的 aestheticsggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, color = class))ggplot(data = <DATA>) + # Data
geom_<xxx>(
mapping = aes(<MAPPINGS>), ## <= Aesthetic mappings
stat = <STAT>,
position = <POSITION>
) +
scale_<xxx>() + coord_<xxx>() + facet_<xxx>()
theme_()aes() 可以放在:
ggplot()裡面 – 有+ aes() – 有geom_<xxx>()裡面 – 無“記憶效果”(只對該 geom 有效)geom_<xxx>(inherit.aes=FALSE): overrides the default aesthetics.有時候你可能只想要手動設定某個
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy), color = "blue")library(gapminder)
gapminder#> # A tibble: 1,704 x 6
#> country continent year lifeExp pop gdpPercap
#> <fct> <fct> <int> <dbl> <int> <dbl>
#> 1 Afghanistan Asia 1952 28.8 8425333 779.
#> 2 Afghanistan Asia 1957 30.3 9240934 821.
#> 3 Afghanistan Asia 1962 32.0 10267083 853.
#> 4 Afghanistan Asia 1967 34.0 11537966 836.
#> 5 Afghanistan Asia 1972 36.1 13079460 740.
#> 6 Afghanistan Asia 1977 38.4 14880372 786.
#> 7 Afghanistan Asia 1982 39.9 12881816 978.
#> 8 Afghanistan Asia 1987 40.8 13867957 852.
#> 9 Afghanistan Asia 1992 41.7 16317921 649.
#> 10 Afghanistan Asia 1997 41.8 22227415 635.
#> # ... with 1,694 more rows
tidyr)purrr)broom)library(dplyr)
library(tidyr)
gapminder <- gapminder %>% mutate(year1950 = year - 1950)
by_country <- gapminder %>%
group_by(continent, country) %>%
nest
by_country#> # A tibble: 142 x 3
#> continent country data
#> <fct> <fct> <list>
#> 1 Asia Afghanistan <tibble [12 × 5]>
#> 2 Europe Albania <tibble [12 × 5]>
#> 3 Africa Algeria <tibble [12 × 5]>
#> 4 Africa Angola <tibble [12 × 5]>
#> 5 Americas Argentina <tibble [12 × 5]>
#> 6 Oceania Australia <tibble [12 × 5]>
#> 7 Europe Austria <tibble [12 × 5]>
#> 8 Asia Bahrain <tibble [12 × 5]>
#> 9 Asia Bangladesh <tibble [12 × 5]>
#> 10 Europe Belgium <tibble [12 × 5]>
#> # ... with 132 more rows
lm(lifeExp ~ year, data = Afghanistan)
lm(lifeExp ~ year, data = Afghanistan)
...library(purrr)
country_model <- function(df) {
lm(lifeExp ~ year1950, data = df)
}
models <- by_country %>%
mutate(mod_lm = map(data, country_model))
models#> # A tibble: 142 x 4
#> continent country data mod_lm
#> <fct> <fct> <list> <list>
#> 1 Asia Afghanistan <tibble [12 × 5]> <S3: lm>
#> 2 Europe Albania <tibble [12 × 5]> <S3: lm>
#> 3 Africa Algeria <tibble [12 × 5]> <S3: lm>
#> 4 Africa Angola <tibble [12 × 5]> <S3: lm>
#> 5 Americas Argentina <tibble [12 × 5]> <S3: lm>
#> 6 Oceania Australia <tibble [12 × 5]> <S3: lm>
#> 7 Europe Austria <tibble [12 × 5]> <S3: lm>
#> 8 Asia Bahrain <tibble [12 × 5]> <S3: lm>
#> 9 Asia Bangladesh <tibble [12 × 5]> <S3: lm>
#> 10 Europe Belgium <tibble [12 × 5]> <S3: lm>
#> # ... with 132 more rows
models <- models %>%
mutate(
glance = mod_lm %>% map(broom::glance),
tidy = mod_lm %>% map(broom::tidy),
augment = mod_lm %>% map(broom::augment),
rsq = glance %>% map_dbl("r.squared")
)
models#> # A tibble: 142 x 8
#> continent country data mod_lm glance tidy augment rsq
#> <fct> <fct> <list> <list> <list> <list> <list> <dbl>
#> 1 Asia Afghanistan <tibble… <S3: l… <data.f… <data.… <data.fr… 0.948
#> 2 Europe Albania <tibble… <S3: l… <data.f… <data.… <data.fr… 0.911
#> 3 Africa Algeria <tibble… <S3: l… <data.f… <data.… <data.fr… 0.985
#> 4 Africa Angola <tibble… <S3: l… <data.f… <data.… <data.fr… 0.888
#> 5 Americas Argentina <tibble… <S3: l… <data.f… <data.… <data.fr… 0.996
#> 6 Oceania Australia <tibble… <S3: l… <data.f… <data.… <data.fr… 0.980
#> 7 Europe Austria <tibble… <S3: l… <data.f… <data.… <data.fr… 0.992
#> 8 Asia Bahrain <tibble… <S3: l… <data.f… <data.… <data.fr… 0.967
#> 9 Asia Bangladesh <tibble… <S3: l… <data.f… <data.… <data.fr… 0.989
#> 10 Europe Belgium <tibble… <S3: l… <data.f… <data.… <data.fr… 0.995
#> # ... with 132 more rows
models %>%
ggplot(aes(rsq, reorder(country, rsq))) + # use factor levels
geom_point(aes(colour = continent))models %>% unnest(data)#> # A tibble: 1,704 x 8
#> continent country rsq year lifeExp pop gdpPercap year1950
#> <fct> <fct> <dbl> <int> <dbl> <int> <dbl> <dbl>
#> 1 Asia Afghanistan 0.948 1952 28.8 8425333 779. 2.
#> 2 Asia Afghanistan 0.948 1957 30.3 9240934 821. 7.
#> 3 Asia Afghanistan 0.948 1962 32.0 10267083 853. 12.
#> 4 Asia Afghanistan 0.948 1967 34.0 11537966 836. 17.
#> 5 Asia Afghanistan 0.948 1972 36.1 13079460 740. 22.
#> 6 Asia Afghanistan 0.948 1977 38.4 14880372 786. 27.
#> 7 Asia Afghanistan 0.948 1982 39.9 12881816 978. 32.
#> 8 Asia Afghanistan 0.948 1987 40.8 13867957 852. 37.
#> 9 Asia Afghanistan 0.948 1992 41.7 16317921 649. 42.
#> 10 Asia Afghanistan 0.948 1997 41.8 22227415 635. 47.
#> # ... with 1,694 more rows
models %>% unnest(glance, .drop = TRUE)#> # A tibble: 142 x 14
#> continent country rsq r.squared adj.r.squared sigma statistic
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Asia Afghanistan 0.948 0.948 0.942 1.22 181.
#> 2 Europe Albania 0.911 0.911 0.902 1.98 102.
#> 3 Africa Algeria 0.985 0.985 0.984 1.32 662.
#> 4 Africa Angola 0.888 0.888 0.877 1.41 79.1
#> 5 Americas Argentina 0.996 0.996 0.995 0.292 2246.
#> 6 Oceania Australia 0.980 0.980 0.978 0.621 481.
#> 7 Europe Austria 0.992 0.992 0.991 0.407 1261.
#> 8 Asia Bahrain 0.967 0.967 0.963 1.64 291.
#> 9 Asia Bangladesh 0.989 0.989 0.988 0.977 930.
#> 10 Europe Belgium 0.995 0.995 0.994 0.293 1822.
#> # ... with 132 more rows, and 7 more variables: p.value <dbl>, df <int>,
#> # logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>, df.residual <int>
models %>% unnest(tidy, .drop = TRUE)#> # A tibble: 284 x 8
#> continent country rsq term estimate std.error statistic p.value
#> <fct> <fct> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Asia Afghanistan 0.948 (Int… 29.4 0.699 42.0 1.40e-12
#> 2 Asia Afghanistan 0.948 year… 0.275 0.0205 13.5 9.84e- 8
#> 3 Europe Albania 0.911 (Int… 58.6 1.13 51.7 1.79e-13
#> 4 Europe Albania 0.911 year… 0.335 0.0332 10.1 1.46e- 6
#> 5 Africa Algeria 0.985 (Int… 42.2 0.756 55.8 8.22e-14
#> 6 Africa Algeria 0.985 year… 0.569 0.0221 25.7 1.81e-10
#> 7 Africa Angola 0.888 (Int… 31.7 0.804 39.4 2.63e-12
#> 8 Africa Angola 0.888 year… 0.209 0.0235 8.90 4.59e- 6
#> 9 Americas Argentina 0.996 (Int… 62.2 0.167 372. 4.80e-22
#> 10 Americas Argentina 0.996 year… 0.232 0.00489 47.4 4.22e-13
#> # ... with 274 more rows
models %>%
unnest(tidy) %>%
select(continent, country, term, estimate, rsq) %>%
spread(key = term, value = estimate) %>%
ggplot(aes(`(Intercept)`, year1950)) +
geom_point(aes(colour = continent, size = rsq)) +
geom_smooth(se = FALSE)models %>%
unnest(augment) %>%
ggplot(aes(year1950, .resid)) +
geom_line(aes(group = country), alpha = 0.3) +
geom_smooth(se = FALSE) +
geom_hline(yintercept = 0, colour = "red", alpha = 0.7) +
facet_wrap(~continent)tidyr: 把物件 (例如 lm) 用 list 存在 columns 裡面purrr: Functional programmingbroom: Models → tidy datalibrary(ggplot2)
## data
set.seed(2016)
grp <- gl(n=4, k=20, labels=c("group a","group b","group c", "group d"))
value <- runif(n=80, min=10, max=150)
outcome <- cut(value,2)
(data <- data.frame(grp,value,outcome))#> grp value outcome
#> 1 group a 35.22290 (10.2,76.3]
#> 2 group a 30.01212 (10.2,76.3]
#> 3 group a 127.83051 (76.3,142]
#> 4 group a 28.70045 (10.2,76.3]
#> 5 group a 76.85034 (76.3,142]
#> 6 group a 26.97618 (10.2,76.3]
#> 7 group a 96.32838 (76.3,142]
#> 8 group a 134.67666 (76.3,142]
#> 9 group a 10.36728 (10.2,76.3]
#> 10 group a 17.48423 (10.2,76.3]
#> 11 group a 64.41631 (10.2,76.3]
#> 12 group a 48.21350 (10.2,76.3]
#> 13 group a 41.16120 (10.2,76.3]
#> 14 group a 132.72041 (76.3,142]
#> 15 group a 44.53364 (10.2,76.3]
#> 16 group a 65.06026 (10.2,76.3]
#> 17 group a 100.05053 (76.3,142]
#> 18 group a 28.45157 (10.2,76.3]
#> 19 group a 90.16397 (76.3,142]
#> 20 group a 22.45060 (10.2,76.3]
#> 21 group b 56.60950 (10.2,76.3]
#> 22 group b 105.97659 (76.3,142]
#> 23 group b 64.47677 (10.2,76.3]
#> 24 group b 88.47872 (76.3,142]
#> 25 group b 137.36254 (76.3,142]
#> 26 group b 93.86846 (76.3,142]
#> 27 group b 35.56070 (10.2,76.3]
#> 28 group b 95.71090 (76.3,142]
#> 29 group b 142.24888 (76.3,142]
#> 30 group b 70.01091 (10.2,76.3]
#> 31 group b 98.30474 (76.3,142]
#> 32 group b 124.95707 (76.3,142]
#> 33 group b 36.63055 (10.2,76.3]
#> 34 group b 133.01564 (76.3,142]
#> 35 group b 129.15391 (76.3,142]
#> 36 group b 30.68462 (10.2,76.3]
#> 37 group b 43.79335 (10.2,76.3]
#> 38 group b 49.33383 (10.2,76.3]
#> 39 group b 43.93878 (10.2,76.3]
#> 40 group b 104.30316 (76.3,142]
#> 41 group c 57.25786 (10.2,76.3]
#> 42 group c 23.86566 (10.2,76.3]
#> 43 group c 45.88386 (10.2,76.3]
#> 44 group c 103.77243 (76.3,142]
#> 45 group c 62.16864 (10.2,76.3]
#> 46 group c 48.48868 (10.2,76.3]
#> 47 group c 111.38763 (76.3,142]
#> 48 group c 53.71373 (10.2,76.3]
#> 49 group c 89.98226 (76.3,142]
#> 50 group c 77.86725 (76.3,142]
#> 51 group c 32.09404 (10.2,76.3]
#> 52 group c 68.03531 (10.2,76.3]
#> 53 group c 46.23794 (10.2,76.3]
#> 54 group c 62.36134 (10.2,76.3]
#> 55 group c 137.55317 (76.3,142]
#> 56 group c 79.68930 (76.3,142]
#> 57 group c 131.94144 (76.3,142]
#> 58 group c 18.01237 (10.2,76.3]
#> 59 group c 29.73031 (10.2,76.3]
#> 60 group c 63.13070 (10.2,76.3]
#> 61 group d 32.43695 (10.2,76.3]
#> 62 group d 80.13010 (76.3,142]
#> 63 group d 75.26836 (10.2,76.3]
#> 64 group d 34.06693 (10.2,76.3]
#> 65 group d 134.69061 (76.3,142]
#> 66 group d 130.04567 (76.3,142]
#> 67 group d 19.32048 (10.2,76.3]
#> 68 group d 23.05680 (10.2,76.3]
#> 69 group d 82.97590 (76.3,142]
#> 70 group d 133.07933 (76.3,142]
#> 71 group d 137.68230 (76.3,142]
#> 72 group d 142.15229 (76.3,142]
#> 73 group d 38.12144 (10.2,76.3]
#> 74 group d 106.15707 (76.3,142]
#> 75 group d 39.43386 (10.2,76.3]
#> 76 group d 19.20592 (10.2,76.3]
#> 77 group d 53.36572 (10.2,76.3]
#> 78 group d 132.45927 (76.3,142]
#> 79 group d 19.52064 (10.2,76.3]
#> 80 group d 33.12068 (10.2,76.3]
## Origianl plot
ggplot(data, aes(grp, fill=outcome)) +
geom_bar() +
xlab("group") +
ylab("number of subjects") +
labs(fill = "Serologic response")# Change the displayed labels for groups
# The standard way is to use the scale functions to change the displayed labels for groups. You can replace your ggplot call with
ggplot(data, aes(grp, fill=outcome)) +
geom_bar() +
xlab("group") +
ylab("number of subjects") +
scale_fill_discrete(name="Serologic response",
breaks=levels(data$outcome),
labels=c("double negative", "positive for a and/or b"))# Note that the scale's title has been incorporated into the scale_fill_discrete call. You can do this with the "axes" too, if you like
ggplot(data, aes(grp, fill=outcome)) +
geom_bar() +
scale_x_discrete("group") + # x-axis name
scale_y_continuous("number of subjects") + # y-axis name
scale_fill_discrete(name="Serologic response",
breaks=levels(data$outcome),
labels=c("double negative", "positive for a and/or b"))沒有排序的 bar chart 很難看
reorder(<要排序的變數>, <參照大小>)d <- mpg %>%
group_by(class) %>%
summarise(n = n())
d
#> # A tibble: 7 x 2
#> class n
#> <chr> <int>
#> 1 2seater 5
#> 2 compact 47
#> 3 midsize 41
#> 4 minivan 11
#> 5 pickup 33
#> 6 subcompact 35
#> 7 suv 62
ggplot(data = d) +
geom_bar(mapping = aes(x = reorder(class, -n), y = n),
stat = "identity")